# Knitr setup: hide the R code in the rendered document, show output only.
knitr::opts_chunk$set(echo = FALSE)
# Packages required by this analysis (descriptives, regression diagnostics,
# effect sizes, dimension reduction, data import).
needed_packages <- c("VIM", "tidyverse","pastecs", "ggplot2", "semTools", "psych", "FSA", "car", "effectsize", "coin", "rstatix", "sjstats", "userfriendlyscience", "stats", "foreign", "gmodels", "lm.beta","stargazer", "lmtest", "DescTools", "nnet", "reshape2", "generalhoslem", "Epi", "arm", "regclass", "olsrr","REdaS", "Hmisc","corrplot","ggcorrplot", "factoextra", "nFactors","readxl")
# Extract packages that are not yet installed on this machine.
not_installed <- needed_packages[!(needed_packages %in% installed.packages()[, "Package"])]
# Install any missing packages. Explicit length check and braced body:
# `if(length(x))` relies on 0/non-0 truthiness and an unbraced `if` is
# fragile if another statement is ever inserted between the two lines.
if (length(not_installed) > 0) {
  install.packages(not_installed)
}
library(pastecs) #For creating descriptive statistic summaries
library(ggplot2) #For creating histograms with more detail than plot
library(semTools) #For skewness and kurtosis
## Loading required package: lavaan
## This is lavaan 0.6-7
## lavaan is BETA software! Please report any bugs.
##
## ###############################################################################
## This is semTools 0.5-3
## All users of R (or SEM) are invited to submit functions or ideas for functions.
## ###############################################################################
library(psych) #For descriptive functions
##
## Attaching package: 'psych'
## The following object is masked from 'package:semTools':
##
## skew
## The following object is masked from 'package:lavaan':
##
## cor2cov
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(FSA) #For percentage
## ## FSA v0.8.30. See citation('FSA') if used in publication.
## ## Run fishR() for related website and fishR('IFAR') for related book.
##
## Attaching package: 'FSA'
## The following object is masked from 'package:psych':
##
## headtail
library(car) # For Levene's test for homogeneity of variance and test for colinearity of predictors
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:FSA':
##
## bootCase
## The following object is masked from 'package:psych':
##
## logit
library(effectsize) #To calculate effect size for t-test
##
## Attaching package: 'effectsize'
## The following object is masked from 'package:psych':
##
## phi
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble 3.0.3 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x psych::%+%() masks ggplot2::%+%()
## x psych::alpha() masks ggplot2::alpha()
## x readr::clipboard() masks semTools::clipboard()
## x tidyr::extract() masks pastecs::extract()
## x dplyr::filter() masks stats::filter()
## x dplyr::first() masks pastecs::first()
## x dplyr::lag() masks stats::lag()
## x dplyr::last() masks pastecs::last()
## x dplyr::recode() masks car::recode()
## x purrr::some() masks car::some()
library(coin) # For Wilcox test (non-parametric)
## Loading required package: survival
library(rstatix) # For calculating effect size
##
## Attaching package: 'rstatix'
## The following objects are masked from 'package:coin':
##
## chisq_test, friedman_test, kruskal_test, sign_test, wilcox_test
## The following objects are masked from 'package:effectsize':
##
## cohens_d, eta_squared
## The following object is masked from 'package:stats':
##
## filter
library(sjstats) #calculate effect size for t-test
## Registered S3 methods overwritten by 'lme4':
## method from
## cooks.distance.influence.merMod car
## influence.merMod car
## dfbeta.influence.merMod car
## dfbetas.influence.merMod car
##
## Attaching package: 'sjstats'
## The following objects are masked from 'package:effectsize':
##
## cohens_f, phi
## The following object is masked from 'package:FSA':
##
## se
## The following object is masked from 'package:psych':
##
## phi
library(userfriendlyscience)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
## Registered S3 methods overwritten by 'ufs':
## method from
## grid.draw.ggProportionPlot userfriendlyscience
## pander.associationMatrix userfriendlyscience
## pander.dataShape userfriendlyscience
## pander.descr userfriendlyscience
## pander.normalityAssessment userfriendlyscience
## print.CramersV userfriendlyscience
## print.associationMatrix userfriendlyscience
## print.confIntOmegaSq userfriendlyscience
## print.confIntV userfriendlyscience
## print.dataShape userfriendlyscience
## print.descr userfriendlyscience
## print.ggProportionPlot userfriendlyscience
## print.meanConfInt userfriendlyscience
## print.multiVarFreq userfriendlyscience
## print.normalityAssessment userfriendlyscience
## print.regrInfluential userfriendlyscience
## print.scaleDiagnosis userfriendlyscience
## print.scaleStructure userfriendlyscience
## print.scatterMatrix userfriendlyscience
##
## Attaching package: 'userfriendlyscience'
## The following objects are masked from 'package:FSA':
##
## is.even, is.odd
## The following object is masked from 'package:semTools':
##
## reliability
library(stats)
library(foreign) # open SPSS file, I may not use that.
library(gmodels) #For creating histograms with more detail than plot
##
## Attaching package: 'gmodels'
## The following object is masked from 'package:sjstats':
##
## ci
library(stargazer)#For formatting outputs/tables for regression
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(lm.beta) # to isolate the beta co-efficients for regression
#Multinomial regression
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(DescTools)
## Registered S3 method overwritten by 'DescTools':
## method from
## reorder.factor gdata
##
## Attaching package: 'DescTools'
## The following object is masked from 'package:car':
##
## Recode
## The following objects are masked from 'package:psych':
##
## AUC, ICC, SD
library(nnet) #Multinomial regression
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(generalhoslem) #For test of fit for logistic regression, test assumption of linearity
## Loading required package: reshape
##
## Attaching package: 'reshape'
## The following objects are masked from 'package:reshape2':
##
## colsplit, melt, recast
## The following object is masked from 'package:dplyr':
##
## rename
## The following objects are masked from 'package:tidyr':
##
## expand, smiths
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:rstatix':
##
## select
## The following object is masked from 'package:dplyr':
##
## select
library(Epi) #ROC Curve
library(arm) #for invlogit calculating predicted probabilities
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:reshape':
##
## expand
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loading required package: lme4
##
## Attaching package: 'lme4'
## The following object is masked from 'package:Epi':
##
## factorize
## The following object is masked from 'package:userfriendlyscience':
##
## getData
##
## arm (Version 1.11-2, built: 2020-7-27)
## Working directory is /home/d19125334/Documents/PhD_Modules_2021/ProbilityandStatisiticInference/assignment/assignment2/PSI_Assginment2
##
## Attaching package: 'arm'
## The following object is masked from 'package:effectsize':
##
## standardize
## The following object is masked from 'package:car':
##
## logit
## The following objects are masked from 'package:psych':
##
## logit, rescale, sim
library(regclass) #For confusion matrix
## Loading required package: bestglm
## Loading required package: leaps
## Loading required package: VGAM
## Loading required package: stats4
## Loading required package: splines
##
## Attaching package: 'VGAM'
## The following object is masked from 'package:arm':
##
## logit
## The following object is masked from 'package:lmtest':
##
## lrtest
## The following object is masked from 'package:tidyr':
##
## fill
## The following object is masked from 'package:VIM':
##
## wine
## The following object is masked from 'package:car':
##
## logit
## The following objects are masked from 'package:psych':
##
## fisherz, logistic, logit
## Loading required package: rpart
## Loading required package: randomForest
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:psych':
##
## outlier
## The following object is masked from 'package:ggplot2':
##
## margin
## Important regclass change from 1.3:
## All functions that had a . in the name now have an _
## all.correlations -> all_correlations, cor.demo -> cor_demo, etc.
##
## Attaching package: 'regclass'
## The following object is masked from 'package:DescTools':
##
## VIF
library(olsrr)
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:MASS':
##
## cement
## The following object is masked from 'package:datasets':
##
## rivers
library(dplyr)
library(broom)
##
## Attaching package: 'broom'
## The following object is masked from 'package:sjstats':
##
## bootstrap
library(ggpubr)
#Dimension Reduction
library(REdaS)
library(Hmisc)
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:regclass':
##
## qq
## The following object is masked from 'package:userfriendlyscience':
##
## oneway
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:DescTools':
##
## %nin%, Label, Mean, Quantile
## The following object is masked from 'package:userfriendlyscience':
##
## escapeRegex
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following object is masked from 'package:psych':
##
## describe
## The following objects are masked from 'package:base':
##
## format.pval, units
library(corrplot)
## corrplot 0.84 loaded
##
## Attaching package: 'corrplot'
## The following object is masked from 'package:arm':
##
## corrplot
library(ggcorrplot)
##
## Attaching package: 'ggcorrplot'
## The following object is masked from 'package:rstatix':
##
## cor_pmat
library(factoextra) #Used for principal component analysis to get a different view of eigenvalues
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(nFactors)
##
## Attaching package: 'nFactors'
## The following object is masked from 'package:lattice':
##
## parallel
library("readxl")
# Load the Colombian academic-performance dataset from the Excel workbook.
# NOTE(review): assumes "data_academic_performance.xlsx" is in the working
# directory and contains a sheet named "SABER11_SABERPRO" -- confirm path.
data_acperfor <- read_excel("data_academic_performance.xlsx", sheet = "SABER11_SABERPRO")
# Normalise all column names to lower case for consistent referencing below.
colnames(data_acperfor) <- tolower(colnames(data_acperfor))
# Inspect the first rows to verify the import.
head(data_acperfor)
## # A tibble: 6 x 44
## cod_s11 gender edu_father edu_mother occ_father occ_mother stratum sisben
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 SB1120… F Incomplet… Complete … Technical… Home Stratu… It is…
## 2 SB1120… F Complete … Complete … Entrepren… Independe… Stratu… It is…
## 3 SB1120… M Not sure Not sure Independe… Home Stratu… Level…
## 4 SB1120… F Not sure Not sure Other occ… Independe… Stratu… It is…
## 5 SB1120… M Complete … Complete … Executive Home Stratu… It is…
## 6 SB1120… F Complete … Complete … Independe… Executive Stratu… It is…
## # … with 36 more variables: people_house <chr>, internet <chr>, tv <chr>,
## # computer <chr>, washing_mch <chr>, mic_oven <chr>, car <chr>, dvd <chr>,
## # fresh <chr>, phone <chr>, mobile <chr>, revenue <chr>, job <chr>,
## # school_name <chr>, school_nat <chr>, school_type <chr>, mat_s11 <dbl>,
## # cr_s11 <dbl>, cc_s11 <dbl>, bio_s11 <dbl>, eng_s11 <dbl>, cod_spro <chr>,
## # university <chr>, academic_program <chr>, qr_pro <dbl>, cr_pro <dbl>,
## # cc_pro <dbl>, eng_pro <dbl>, wc_pro <dbl>, fep_pro <dbl>, g_sc <dbl>,
## # percentile <dbl>, `2nd_decile` <dbl>, quartile <dbl>, sel <dbl>,
## # sel_ihe <dbl>
summary(data_acperfor)
## cod_s11 gender edu_father edu_mother
## Length:12411 Length:12411 Length:12411 Length:12411
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## occ_father occ_mother stratum sisben
## Length:12411 Length:12411 Length:12411 Length:12411
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## people_house internet tv computer
## Length:12411 Length:12411 Length:12411 Length:12411
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## washing_mch mic_oven car dvd
## Length:12411 Length:12411 Length:12411 Length:12411
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## fresh phone mobile revenue
## Length:12411 Length:12411 Length:12411 Length:12411
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## job school_name school_nat school_type
## Length:12411 Length:12411 Length:12411 Length:12411
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## mat_s11 cr_s11 cc_s11 bio_s11
## Min. : 26.00 Min. : 24.00 Min. : 0.00 Min. : 11.00
## 1st Qu.: 56.00 1st Qu.: 54.00 1st Qu.: 54.00 1st Qu.: 56.00
## Median : 64.00 Median : 61.00 Median : 60.00 Median : 64.00
## Mean : 64.32 Mean : 60.78 Mean : 60.71 Mean : 63.95
## 3rd Qu.: 72.00 3rd Qu.: 67.00 3rd Qu.: 67.00 3rd Qu.: 71.00
## Max. :100.00 Max. :100.00 Max. :100.00 Max. :100.00
## eng_s11 cod_spro university academic_program
## Min. : 26.0 Length:12411 Length:12411 Length:12411
## 1st Qu.: 50.0 Class :character Class :character Class :character
## Median : 59.0 Mode :character Mode :character Mode :character
## Mean : 61.8
## 3rd Qu.: 72.0
## Max. :100.0
## qr_pro cr_pro cc_pro eng_pro
## Min. : 1.00 Min. : 1.0 Min. : 1.00 Min. : 1.0
## 1st Qu.: 65.00 1st Qu.: 42.0 1st Qu.: 36.00 1st Qu.: 51.0
## Median : 85.00 Median : 67.0 Median : 65.00 Median : 74.0
## Mean : 77.42 Mean : 62.2 Mean : 59.19 Mean : 67.5
## 3rd Qu.: 96.00 3rd Qu.: 86.0 3rd Qu.: 85.00 3rd Qu.: 88.0
## Max. :100.00 Max. :100.0 Max. :100.00 Max. :100.0
## wc_pro fep_pro g_sc percentile
## Min. : 0.0 Min. : 1.0 Min. : 37.0 Min. : 1.00
## 1st Qu.: 28.0 1st Qu.:124.0 1st Qu.:147.0 1st Qu.: 51.00
## Median : 56.0 Median :153.0 Median :163.0 Median : 75.00
## Mean : 53.7 Mean :145.5 Mean :162.7 Mean : 68.45
## 3rd Qu.: 80.0 3rd Qu.:174.0 3rd Qu.:179.0 3rd Qu.: 90.00
## Max. :100.0 Max. :300.0 Max. :247.0 Max. :100.00
## 2nd_decile quartile sel sel_ihe
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:2.000
## Median :4.000 Median :4.000 Median :2.000 Median :2.000
## Mean :3.886 Mean :3.189 Mean :2.599 Mean :2.409
## 3rd Qu.:5.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:3.000
## Max. :5.000 Max. :4.000 Max. :4.000 Max. :4.000
Research question: For male and female engineering students in Colombia, what is the relationship between the overall average score of the professional evaluation (g_sc) and the scores in formulation of engineering projects and in mathematics?
Dependent variable: g_sc: the overall average score of the professional evaluation
Independent variables: 1. fep_pro: Formulation of Engineering Projects. 2. mat_s11 (Mathematics: the only subject common to the two assessment periods; check whether it impacts the global score g_sc). 3. gender (binary: F/M) — first check whether g_sc differs by gender.
*** Add an interaction term to explore further: fep_pro*mat_s11 is included as the interaction variable in Model 3.
# Descriptive statistics
# g_sc (overall professional-evaluation score): detailed summary statistics
# (basic=F suppresses the count/NA block, keeping median/mean/SE/CI/var/sd/CV).
pastecs::stat.desc(data_acperfor$g_sc, basic=F)
## median mean SE.mean CI.mean.0.95 var std.dev
## 163.0000000 162.7104988 0.2074642 0.4066620 534.1866899 23.1124791
## coef.var
## 0.1420466
# Five-number summary plus mean for g_sc.
summary(data_acperfor$g_sc)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 37.0 147.0 163.0 162.7 179.0 247.0
# Analyze Normality --- Dependent variable: g_sc
# Histogram of g_sc on the density scale with a normal curve (same mean/sd as
# the data) overlaid in red for a visual normality check.
# NOTE(review): `..density..`/`..count..` is the pre-3.4 ggplot2 spelling;
# newer ggplot2 prefers after_stat(density)/after_stat(count) -- works with
# the ggplot2 version recorded above, but flag for a future upgrade.
gg_gsc <- ggplot(data_acperfor, aes(x=g_sc)) +
labs(x="g_sc") +
ggtitle("Figure 1 - Histogram for Normalised g_sc") +
geom_histogram(binwidth=2, colour="black", aes(y=..density.., fill=..count..)) +
scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C") +
stat_function(fun=dnorm, color="red",args=list(mean=mean(data_acperfor$g_sc, na.rm=TRUE), sd=sd(data_acperfor$g_sc, na.rm=TRUE)))
gg_gsc
# QQ plot against the theoretical normal quantiles; reference line in red.
qqnorm(data_acperfor$g_sc, main="Figure 2 - QQ Plot for Normalised g_sc")
qqline(data_acperfor$g_sc, col=2)
# Skewness and kurtosis for g_sc.
# semTools::skew/kurtosis return the statistic and its standard error;
# dividing [1]/[2] gives the standardised value (outside roughly +/-2 suggests
# departure from normality, though with n = 12,411 this is very sensitive).
tpskew <- semTools::skew(data_acperfor$g_sc)
tpkurt <- semTools::kurtosis(data_acperfor$g_sc)
# Standardised skewness.
tpskew[1]/tpskew[2]
## skew (g1)
## -4.339492
# Standardised excess kurtosis.
tpkurt[1]/tpkurt[2]
## Excess Kur (g2)
## -1.714035
# Absolute z-scores of g_sc, used to count potential outliers.
gsc<- abs(scale(data_acperfor$g_sc))
# Percentage of standardised scores beyond 1.96 (expect ~5% under normality).
FSA::perc(as.numeric(gsc), 1.96, "gt")
## [1] 4.060914
# Percentage beyond 3.29 (expect ~0.1% under normality).
FSA::perc(as.numeric(gsc), 3.29, "gt")
## [1] 0.15309
# statistics descpritve
# mat_s11: summary statistics
pastecs::stat.desc(data_acperfor$mat_s11, basic=F)
## median mean SE.mean CI.mean.0.95 var std.dev
## 64.0000000 64.3207638 0.1065813 0.2089158 140.9835648 11.8736500
## coef.var
## 0.1846006
summary(data_acperfor$mat_s11)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 26.00 56.00 64.00 64.32 72.00 100.00
# Analyze Normality --- independent variable: mat_s11
gg_fegpro <- ggplot(data_acperfor, aes(x=mat_s11)) +
labs(x="mat_s11") +
ggtitle("Figure 3 - Histogram for Normalised mat_s11") +
geom_histogram(binwidth=2, colour="black", aes(y=..density.., fill=..count..)) +
scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C") +
stat_function(fun=dnorm, color="red",args=list(mean=mean(data_acperfor$mat_s11, na.rm=TRUE), sd=sd(data_acperfor$mat_s11, na.rm=TRUE)))
gg_fegpro
qqnorm(data_acperfor$mat_s11, main="Figure 4 - QQ Plot for Normalised mat_s11")
qqline(data_acperfor$mat_s11, col=2)
#skewness and kurtosis
tpskew <- semTools::skew(data_acperfor$mat_s11)
tpkurt <- semTools::kurtosis(data_acperfor$mat_s11)
tpskew[1]/tpskew[2]
## skew (g1)
## 18.17215
tpkurt[1]/tpkurt[2]
## Excess Kur (g2)
## 2.954064
mats11<- abs(scale(data_acperfor$mat_s11))
FSA::perc(as.numeric(mats11), 1.96, "gt")
## [1] 4.447667
FSA::perc(as.numeric(mats11), 3.29, "gt")
## [1] 0
# statistics descpritve
# fep_pro summary statistics
pastecs::stat.desc(data_acperfor$fep_pro, basic=F)
## median mean SE.mean CI.mean.0.95 var std.dev
## 153.0000000 145.4765933 0.3601859 0.7060202 1610.1268292 40.1263857
## coef.var
## 0.2758271
summary(data_acperfor$fep_pro)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 124.0 153.0 145.5 174.0 300.0
# Analyze Normality --- independent variable: fep_pro
gg_fegpro <- ggplot(data_acperfor, aes(x=fep_pro)) +
labs(x="fep_pro") +
ggtitle("Figure 5 - Histogram for Normalised fep_pro") +
geom_histogram(binwidth=2, colour="black", aes(y=..density.., fill=..count..)) +
scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C") +
stat_function(fun=dnorm, color="red",args=list(mean=mean(data_acperfor$fep_pro, na.rm=TRUE), sd=sd(data_acperfor$fep_pro, na.rm=TRUE)))
gg_fegpro
qqnorm(data_acperfor$fep_pro, main="Figure 6 - QQ Plot for Normalised fep_pro")
qqline(data_acperfor$fep_pro, col=2)
#skewness and kurtosis
tpskew <- semTools::skew(data_acperfor$fep_pro)
tpkurt <- semTools::kurtosis(data_acperfor$fep_pro)
tpskew[1]/tpskew[2]
## skew (g1)
## -38.2767
tpkurt[1]/tpkurt[2]
## Excess Kur (g2)
## 16.28851
feppro<- abs(scale(data_acperfor$fep_pro))
FSA::perc(as.numeric(feppro), 1.96, "gt")
## [1] 4.632987
FSA::perc(as.numeric(feppro), 3.29, "gt")
## [1] 0.5076142
# Check whether the gender variable is balanced.
summary(as.factor(data_acperfor$gender))
## F M
## 5043 7368
# Proportion of each gender. Build the frequency table once instead of
# recomputing it four times, and use `<-` for assignment; the resulting
# values (and their printed output) are identical to the original.
gender_tab <- table(data_acperfor$gender)
female_var <- gender_tab["F"] / (gender_tab["F"] + gender_tab["M"])
male_var <- gender_tab["M"] / (gender_tab["F"] + gender_tab["M"])
print(female_var) # ~0.41
## F
## 0.4063331
print(male_var) # ~0.59
## M
## 0.5936669
# Missing data
# 1. Visualise the missing-data level and pattern for the analysis variables.
# Subset to the four variables used in the models below.
varsint <- c("fep_pro", "mat_s11", "g_sc", "gender")
acperfor_subset <- data_acperfor[varsint]
summary(acperfor_subset)
## fep_pro mat_s11 g_sc gender
## Min. : 1.0 Min. : 26.00 Min. : 37.0 Length:12411
## 1st Qu.:124.0 1st Qu.: 56.00 1st Qu.:147.0 Class :character
## Median :153.0 Median : 64.00 Median :163.0 Mode :character
## Mean :145.5 Mean : 64.32 Mean :162.7
## 3rd Qu.:174.0 3rd Qu.: 72.00 3rd Qu.:179.0
## Max. :300.0 Max. :100.00 Max. :247.0
# Create and inspect patterns of missingness (VIM::aggr also plots them).
res<-summary(VIM::aggr(acperfor_subset, sortVar=TRUE))$combinations
##
## Variables sorted by number of missings:
## Variable Count
## fep_pro 0
## mat_s11 0
## g_sc 0
## gender 0
### Result: no missing data in any of the four variables.
# Correlation between fep_pro and g_sc.
# Scatterplot of g_sc (y) against fep_pro (x).
scat_fepsc <- ggplot2::ggplot(data_acperfor, aes(fep_pro, g_sc))
# Add a linear regression line (no confidence band).
scat_fepsc + geom_point() + geom_smooth(method = "lm", colour = "Red", se = F) + labs(x = "fep_pro", y = "Normalised g_sc")
## `geom_smooth()` using formula 'y ~ x'
# Pearson correlation between fep_pro and g_sc.
stats::cor.test(data_acperfor$fep_pro, data_acperfor$g_sc, method='pearson')
##
## Pearson's product-moment correlation
##
## data: data_acperfor$fep_pro and data_acperfor$g_sc
## t = 44.872, df = 12409, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3584055 0.3886814
## sample estimates:
## cor
## 0.373643
####################################################################################################
# correlation between mat_s11 and g_sc
#show scatterplot of g_sc (y) and mat_s11 (x)
scat_fepsc <- ggplot2::ggplot(data_acperfor, aes(mat_s11, g_sc))
#Add a regression line
scat_fepsc + geom_point() + geom_smooth(method = "lm", colour = "Red", se = F) + labs(x = "mat_s11", y = "Normalised g_sc")
## `geom_smooth()` using formula 'y ~ x'
#Pearson Correlation
stats::cor.test(data_acperfor$mat_s11, data_acperfor$g_sc, method='pearson')
##
## Pearson's product-moment correlation
##
## data: data_acperfor$mat_s11 and data_acperfor$g_sc
## t = 93.733, df = 12409, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6334194 0.6540231
## sample estimates:
## cor
## 0.6438379
Notes: 1. The correlation between fep_pro and g_sc is a weak positive correlation (r = 0.37). 2. The correlation between mat_s11 and g_sc is a moderate positive correlation (r = 0.64).
# Check whether g_sc differs by gender.
# Add gender to investigate a differential effect.
# Independent t-test: first inspect group descriptives by gender
# (mat=TRUE returns the descriptives in a matrix/data-frame layout).
psych::describeBy(data_acperfor$g_sc, data_acperfor$gender, mat=TRUE)
## item group1 vars n mean sd median trimmed mad min max
## X11 1 F 1 5043 161.2782 22.33294 162 161.4994 23.7216 76 242
## X12 2 M 1 7368 163.6908 23.58262 164 163.9561 25.2042 37 247
## range skew kurtosis se
## X11 166 -0.07216008 -0.24717975 0.3144861
## X12 210 -0.12210492 0.01441897 0.2747370
# Levene's test for homogeneity of variance of g_sc across gender groups.
car::leveneTest(g_sc ~ gender, data=data_acperfor)
## Warning in leveneTest.default(y = y, group = group, ...): group coerced to
## factor.
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 13.115 0.000294 ***
## 12409
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Independent-samples t-test (Student's, pooled variance).
# NOTE(review): Levene's test above is significant (p = 0.000294), i.e. the
# equal-variance assumption is rejected, yet var.equal=TRUE is used here.
# Consider the Welch test (var.equal=FALSE, the t.test default) instead.
res <- stats::t.test(g_sc ~ gender, var.equal=TRUE, data=data_acperfor)
res
##
## Two Sample t-test
##
## data: g_sc by gender
## t = -5.7189, df = 12409, p-value = 1.097e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3.239544 -1.585691
## sample estimates:
## mean in group F mean in group M
## 161.2782 163.6908
# Cohen's d computed from the t statistic: d = 2t / sqrt(df).
# Guideline: 0.2 = small effect, 0.5 = moderate, 0.8 = large.
effsize_gender = round((2*res$statistic)/sqrt(res$parameter), 2)
effsize_gender
## t
## -0.1
# Cross-check with effectsize's t-to-d conversion (also gives a 95% CI).
effectsize::t_to_d(t=res$statistic, res$parameter)
## d | 95% CI
## ----------------------
## -0.10 | [-0.14, -0.07]
# Eta squared from the t statistic: eta^2 = t^2 / (t^2 + df).
# Reporting guideline: 0.01 = small, 0.06 = moderate, 0.14 = large.
effes=round((res$statistic*res$statistic)/((res$statistic*res$statistic)+(res$parameter)),3)
effes
## t
## 0.003
Reporting this t-test result:
1. Reporting the results with Cohen's d effect size
An independent-samples t-test was conducted to compare the overall average score of the professional evaluation (g_sc) for female and male students. A small but statistically significant difference in g_sc was found (M=161.28, SD=22.33 for female students; M=163.69, SD=23.58 for male students), t(12409)=-5.72, p < 0.001. Cohen's d indicated a small effect size (-0.10).
2. Reporting the results with the eta-squared effect size
An independent-samples t-test was conducted to compare the overall average score of the professional evaluation (g_sc) for female and male students. A small but statistically significant difference in g_sc was found (M=161.28, SD=22.33 for female students; M=163.69, SD=23.58 for male students), t(12409)=-5.72, p < 0.001. A small effect size was also indicated by the eta-squared value (0.003).
# MLR Model 1: multiple linear regression.
# Independent variables: fep_pro, mat_s11; dependent variable: g_sc.
# NOTE(review): prefer lm(g_sc ~ fep_pro + mat_s11, data = data_acperfor);
# the data_acperfor$ form clutters the coefficient names and makes
# predict() on new data awkward. Left unchanged here so the recorded
# output below still matches the code.
mmodel_1 <- lm(data_acperfor$g_sc ~ data_acperfor$fep_pro + data_acperfor$mat_s11)
# Sequential (Type I) ANOVA table for the fitted model.
anova(mmodel_1)
## Analysis of Variance Table
##
## Response: data_acperfor$g_sc
## Df Sum Sq Mean Sq F value Pr(>F)
## data_acperfor$fep_pro 1 925504 925504 3206.9 < 2.2e-16 ***
## data_acperfor$mat_s11 1 2122802 2122802 7355.5 < 2.2e-16 ***
## Residuals 12408 3580950 289
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("\n *******Summary mulit-model*******\n")
##
## *******Summary mulit-model*******
summary(mmodel_1)
##
## Call:
## lm(formula = data_acperfor$g_sc ~ data_acperfor$fep_pro + data_acperfor$mat_s11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -147.225 -10.994 0.436 11.235 78.513
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 70.836419 0.909734 77.86 <2e-16 ***
## data_acperfor$fep_pro 0.127002 0.003937 32.26 <2e-16 ***
## data_acperfor$mat_s11 1.141128 0.013305 85.76 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.99 on 12408 degrees of freedom
## Multiple R-squared: 0.4598, Adjusted R-squared: 0.4597
## F-statistic: 5281 on 2 and 12408 DF, p-value: < 2.2e-16
lm.beta::lm.beta(mmodel_1)
##
## Call:
## lm(formula = data_acperfor$g_sc ~ data_acperfor$fep_pro + data_acperfor$mat_s11)
##
## Standardized Coefficients::
## (Intercept) data_acperfor$fep_pro data_acperfor$mat_s11
## 0.0000000 0.2204928 0.5862357
stargazer(mmodel_1, type="text")
##
## ================================================
## Dependent variable:
## ----------------------------
## g_sc
## ------------------------------------------------
## fep_pro 0.127***
## (0.004)
##
## mat_s11 1.141***
## (0.013)
##
## Constant 70.836***
## (0.910)
##
## ------------------------------------------------
## Observations 12,411
## R2 0.460
## Adjusted R2 0.460
## Residual Std. Error 16.988 (df = 12408)
## F Statistic 5,281.195*** (df = 2; 12408)
## ================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
# Plot observed g_sc against mat_s11 with Model 1 predictions overlaid.
# BUG(review): `plotting.data` is never defined anywhere in this file, so
# this chunk cannot run as shown -- presumably a grid of predictor values
# with predict(mmodel_1) results (columns mat_s11, predicted.y, fep_pro)
# was created elsewhere; confirm and restore that step.
# NOTE(review): "Mathmatics" in the title/axis strings is a typo, but these
# are runtime strings that appear in the rendered figure, so left as-is.
plotg_sc <- ggplot(data_acperfor, aes(x=mat_s11, y=g_sc)) +
theme_bw() +
geom_point(colour="grey") +
geom_line(data=plotting.data, aes(x=mat_s11, y=predicted.y, colour=fep_pro), size=1) +
geom_smooth(method = "lm", colour = "red", se = F) + labs(x = "mat_s11", y = "g_sc") +
labs(title = "Model 1 Global Score \n as a function of Mathmatics \n and the score of formulation of Engineering Projects",
x = "Mathmatics",
y = "The average score of the professional evaluation",
color = "fep_pro")
plotg_sc
## `geom_smooth()` using formula 'y ~ x'
# Dummy code gender as a numeric variable `sex`: M -> 0, F -> 1, anything
# else -> NA. With this coding, males are the reference category (0) and
# females the category of interest (1) in the regressions below.
# (The original comment stated the coding the other way round, which
# contradicted the code; corrected here. Assignment changed from `=` to
# the idiomatic `<-`; behaviour is identical.)
data_acperfor$sex <- ifelse(data_acperfor$gender == "M", 0, ifelse(data_acperfor$gender == "F", 1, NA))
# MLR Model 2: Model 1 predictors plus the dummy-coded sex variable.
# NOTE(review): as with Model 1, the data= form of lm() would be preferable;
# kept as-is so the recorded output below still matches.
mmodel_2 <- lm(data_acperfor$g_sc ~ data_acperfor$fep_pro + data_acperfor$mat_s11 +data_acperfor$sex)
cat("\n *******Summary mulit-model*******\n")
##
## *******Summary mulit-model*******
# Coefficients, fit statistics, and residual summary for Model 2.
summary(mmodel_2)
##
## Call:
## lm(formula = data_acperfor$g_sc ~ data_acperfor$fep_pro + data_acperfor$mat_s11 +
## data_acperfor$sex)
##
## Residuals:
## Min 1Q Median 3Q Max
## -146.526 -10.944 0.421 11.343 77.319
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 69.004330 0.942286 73.231 < 2e-16 ***
## data_acperfor$fep_pro 0.125806 0.003932 31.992 < 2e-16 ***
## data_acperfor$mat_s11 1.157898 0.013477 85.915 < 2e-16 ***
## data_acperfor$sex 2.282554 0.314493 7.258 4.17e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.95 on 12407 degrees of freedom
## Multiple R-squared: 0.4621, Adjusted R-squared: 0.462
## F-statistic: 3553 on 3 and 12407 DF, p-value: < 2.2e-16
anova(mmodel_2)
## Analysis of Variance Table
##
## Response: data_acperfor$g_sc
## Df Sum Sq Mean Sq F value Pr(>F)
## data_acperfor$fep_pro 1 925504 925504 3220.231 < 2.2e-16 ***
## data_acperfor$mat_s11 1 2122802 2122802 7386.150 < 2.2e-16 ***
## data_acperfor$sex 1 15140 15140 52.677 4.166e-13 ***
## Residuals 12407 3565811 287
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lm.beta::lm.beta(mmodel_2)
##
## Call:
## lm(formula = data_acperfor$g_sc ~ data_acperfor$fep_pro + data_acperfor$mat_s11 +
## data_acperfor$sex)
##
## Standardized Coefficients::
## (Intercept) data_acperfor$fep_pro data_acperfor$mat_s11
## 0.00000000 0.21841594 0.59485077
## data_acperfor$sex
## 0.04850701
stargazer(mmodel_2, type="text")
##
## ================================================
## Dependent variable:
## ----------------------------
## g_sc
## ------------------------------------------------
## fep_pro 0.126***
## (0.004)
##
## mat_s11 1.158***
## (0.013)
##
## sex 2.283***
## (0.314)
##
## Constant 69.004***
## (0.942)
##
## ------------------------------------------------
## Observations 12,411
## R2 0.462
## Adjusted R2 0.462
## Residual Std. Error 16.953 (df = 12407)
## F Statistic 3,553.019*** (df = 3; 12407)
## ================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
# Plot observed g_sc against mat_s11 with Model 2 predictions by gender.
# BUG(review): `plotting.data2` is never defined anywhere in this file, so
# this chunk cannot run as shown -- presumably a prediction grid (columns
# mat_s11, predicted.y, sex) built from predict(mmodel_2) was created
# elsewhere; confirm and restore that step.
plotg_sc2 <- ggplot(data_acperfor, aes(x=mat_s11, y=g_sc)) +
theme_bw() +
geom_point(colour="grey") +
geom_line(data=plotting.data2, aes(x=mat_s11, y=predicted.y, colour=sex), size=1.25) +
geom_smooth(method = "lm", colour = "red", se = F) + labs(x = "mat_s11", y = "g_sc") +
labs(title = "Model 2 Global Score \n as a function of Mathmatics \n and the score of formulation of Engineering Projects by gender",
x = "Mathmatics",
y = "The average score of the professional evaluation",
color = "Gender")
plotg_sc2
## `geom_smooth()` using formula 'y ~ x'
stargazer(mmodel_1, mmodel_2, type="text")
##
## =============================================================================
## Dependent variable:
## ---------------------------------------------------------
## g_sc
## (1) (2)
## -----------------------------------------------------------------------------
## fep_pro 0.127*** 0.126***
## (0.004) (0.004)
##
## mat_s11 1.141*** 1.158***
## (0.013) (0.013)
##
## sex 2.283***
## (0.314)
##
## Constant 70.836*** 69.004***
## (0.910) (0.942)
##
## -----------------------------------------------------------------------------
## Observations 12,411 12,411
## R2 0.460 0.462
## Adjusted R2 0.460 0.462
## Residual Std. Error 16.988 (df = 12408) 16.953 (df = 12407)
## F Statistic 5,281.195*** (df = 2; 12408) 3,553.019*** (df = 3; 12407)
## =============================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
# investigating partial correlation: two variables while controlling for another
varsmodel <- c("g_sc", "sex", "fep_pro", "mat_s11")
omitdata <- na.omit(data_acperfor[varsmodel])
ppcor::spcor.test(omitdata$g_sc, omitdata$mat_s11, omitdata$fep_pro)
## estimate p.value statistic n gp Method
## 1 0.5658774 0 76.45188 12411 1 pearson
#zero order correlations
# Simple (uncontrolled) Pearson correlations between each pair of
# variables, for comparison against the controlled estimate.
cor(omitdata$g_sc, omitdata$fep_pro)
## [1] 0.373643
cor(omitdata$g_sc, omitdata$mat_s11)
## [1] 0.6438379
cor(omitdata$mat_s11, omitdata$fep_pro)
## [1] 0.2612434
Reporting results of partial correlation
Partial correlation was used to explore the relationship between the global score (as measured by g_sc) and mathematics while controlling for scores on formulation of engineering projects (fep_pro). Preliminary analyses were performed to ensure no violation of the assumptions of normality, linearity and homoscedasticity. There was a strong, positive partial correlation between the global scores, mathematics and the scores on formulation of engineering projects (r=0.57, n=12,411, p<0.001). An inspection of the zero-order correlation (r=0.26) suggested that the scores on formulation of engineering projects had very little effect on the strength of the relationship between these two variables.
# add interaction item
# Interaction term: product of the two continuous predictors.
data_acperfor$interaction <- data_acperfor$fep_pro * data_acperfor$mat_s11
# Model 3: sex + mathematics + the fep_pro x mat_s11 interaction.
# NOTE(review): the formula mixes columns from omitdata and data_acperfor;
# the rows only line up because na.omit() dropped nothing here (n = 12,411
# in both) — refitting with a single data= frame would be safer. Also the
# fep_pro main effect is omitted while its interaction is kept — confirm
# this is intentional.
mmodel_3 <- lm(omitdata$g_sc ~ omitdata$sex + omitdata$mat_s11 + data_acperfor$interaction)
summary(mmodel_3)
##
## Call:
## lm(formula = omitdata$g_sc ~ omitdata$sex + omitdata$mat_s11 +
## data_acperfor$interaction)
##
## Residuals:
## Min 1Q Median 3Q Max
## -146.748 -10.995 0.406 11.360 78.377
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.686e+01 9.147e-01 94.962 < 2e-16 ***
## omitdata$sex 2.332e+00 3.159e-01 7.382 1.66e-13 ***
## omitdata$mat_s11 9.064e-01 1.787e-02 50.729 < 2e-16 ***
## data_acperfor$interaction 1.751e-03 5.843e-05 29.975 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.03 on 12407 degrees of freedom
## Multiple R-squared: 0.4571, Adjusted R-squared: 0.4569
## F-statistic: 3481 on 3 and 12407 DF, p-value: < 2.2e-16
anova(mmodel_3)
## Analysis of Variance Table
##
## Response: omitdata$g_sc
## Df Sum Sq Mean Sq F value Pr(>F)
## omitdata$sex 1 17426 17426 60.07 9.867e-15 ***
## omitdata$mat_s11 1 2751869 2751869 9485.83 < 2.2e-16 ***
## data_acperfor$interaction 1 260653 260653 898.49 < 2.2e-16 ***
## Residuals 12407 3599308 290
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lm.beta::lm.beta(mmodel_3)
##
## Call:
## lm(formula = omitdata$g_sc ~ omitdata$sex + omitdata$mat_s11 +
## data_acperfor$interaction)
##
## Standardized Coefficients::
## (Intercept) omitdata$sex omitdata$mat_s11
## 0.00000000 0.04956147 0.46562876
## data_acperfor$interaction
## 0.27229709
stargazer(mmodel_3, type="text")
##
## ================================================
## Dependent variable:
## ----------------------------
## g_sc
## ------------------------------------------------
## sex 2.332***
## (0.316)
##
## mat_s11 0.906***
## (0.018)
##
## interaction 0.002***
## (0.0001)
##
## Constant 86.858***
## (0.915)
##
## ------------------------------------------------
## Observations 12,411
## R2 0.457
## Adjusted R2 0.457
## Residual Std. Error 17.032 (df = 12407)
## F Statistic 3,481.463*** (df = 3; 12407)
## ================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
# Plot Model 3: observed global scores (g_sc) against mathematics (mat_s11),
# with the interaction model's per-gender predicted lines (from
# plotting.data3) and an overall linear trend line in red.
plotg_sc3 <- ggplot(data_acperfor, aes(x = mat_s11, y = g_sc)) +
  theme_bw() +
  geom_point(colour = "grey") +
  geom_line(data = plotting.data3,
            aes(x = mat_s11, y = predicted.y, colour = sex),
            size = 1) +
  # se = FALSE suppresses the confidence ribbon around the overall fit.
  geom_smooth(method = "lm", colour = "red", se = FALSE) +
  # A redundant labs(x=, y=) call was removed: this labs() call overrides
  # those labels anyway. "Mathematics" typo fixed in the displayed title.
  labs(title = "Model 3 Global Score \n as a function of Mathematics \n and the score of formulation of engineering projects \n with interaction by gender",
       x = "Mathematics",
       y = "The average score of the professional evaluation",
       color = "Gender")
plotg_sc3
## `geom_smooth()` using formula 'y ~ x'
stargazer(mmodel_1, mmodel_2, mmodel_3, type="text")
##
## ==========================================================================================================
## Dependent variable:
## --------------------------------------------------------------------------------------
## g_sc g_sc
## (1) (2) (3)
## ----------------------------------------------------------------------------------------------------------
## fep_pro 0.127*** 0.126***
## (0.004) (0.004)
##
## mat_s11 1.141*** 1.158***
## (0.013) (0.013)
##
## sex 2.283***
## (0.314)
##
## sex 2.332***
## (0.316)
##
## mat_s11 0.906***
## (0.018)
##
## interaction 0.002***
## (0.0001)
##
## Constant 70.836*** 69.004*** 86.858***
## (0.910) (0.942) (0.915)
##
## ----------------------------------------------------------------------------------------------------------
## Observations 12,411 12,411 12,411
## R2 0.460 0.462 0.457
## Adjusted R2 0.460 0.462 0.457
## Residual Std. Error 16.988 (df = 12408) 16.953 (df = 12407) 17.032 (df = 12407)
## F Statistic 5,281.195*** (df = 2; 12408) 3,553.019*** (df = 3; 12407) 3,481.463*** (df = 3; 12407)
## ==========================================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
# Standardized residuals for Model 1; values outside [-3.29, 3.29] flag
# potential outliers. Fixes: '<-' replaces '=' for top-level assignment,
# and FALSE replaces the reassignable shorthand F.
stdres_mmolde1 <- rstandard(mmodel_1)
summary(stdres_mmolde1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -8.667304 -0.647218 0.025666 0.000007 0.661424 4.622050
pastecs::stat.desc(stdres_mmolde1, basic=FALSE)
## median mean SE.mean CI.mean.0.95 var std.dev
## 2.566595e-02 7.426034e-06 8.976692e-03 1.759571e-02 1.000091e+00 1.000045e+00
## coef.var
## 1.346675e+05
#Influential observations
# An observation that changes the slope of the line.
# They have a large influence on the fit of the model.
# Influential-outlier screening via Cook's distance (sorted ascending).
cooksd_mod1 <- sort(cooks.distance(mmodel_1))
#plot Cook's distance with the conventional 4*mean(D) cutoff
plot(cooksd_mod1, pch="*", cex=2, main="Influential observations by Cooks distance for Model 1")
abline(h=4*mean(cooksd_mod1, na.rm=TRUE), col="red") #cutoff line
# Label only the points above the cutoff.
# Fixes: 'na.rom' was a typo for 'na.rm' (the misspelled argument was
# silently swallowed by mean()'s '...', leaving na.rm at its FALSE
# default), and seq_along() replaces the unsafe 1:length() pattern.
text(x=seq_along(cooksd_mod1)+1, y=cooksd_mod1, labels=ifelse(cooksd_mod1>4*mean(cooksd_mod1, na.rm=TRUE), names(cooksd_mod1), ""), col="red")
#find rows related to influential observations
influential_model1 <- as.numeric(names(cooksd_mod1)[(cooksd_mod1 > 4*mean(cooksd_mod1, na.rm=TRUE))]) # influential row numbers
stem(influential_model1)
##
## The decimal point is 3 digit(s) to the right of the |
##
## 0 | 00011111222233444555666777888899
## 1 | 0011111111122233333334444555556677788899999
## 2 | 00000000011111111111111122222223333333444455555555666666677777778888
## 3 | 0011111222333344444455555566667777778888888899999
## 4 | 00011112223333333444455556667777888899999
## 5 | 0000011112222223333333333333344445555666666666777788888899999
## 6 | 00000011111111222222222333344444444455555555566677777777888888888999
## 7 | 000000011111222223344444444555666677777778888888999999
## 8 | 000001112222233333344444445555555677778888899
## 9 | 000000011122222222333333333344444445556666666667777778888999
## 10 | 000000011111333344445555666666667778899999
## 11 | 00000111122222222223333344444555555666677788888899999
## 12 | 000111111111122222333333333444444
head(data_acperfor[influential_model1, ]) # influential observations.
## # A tibble: 6 x 46
## cod_s11 gender edu_father edu_mother occ_father occ_mother stratum sisben
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 SB1120… F Complete … Complete … Auxiliary… Auxiliary… Stratu… It is…
## 2 SB1120… F Complete … Complete … 0 Technical… Stratu… It is…
## 3 SB1120… M Complete … Incomplet… Independe… Home Stratu… Level…
## 4 SB1120… M Complete … Complete … Other occ… Auxiliary… Stratu… Level…
## 5 SB1120… M Complete … Complete … Technical… Home Stratu… It is…
## 6 SB1120… F Incomplet… Incomplet… Operator Home Stratu… It is…
## # … with 38 more variables: people_house <chr>, internet <chr>, tv <chr>,
## # computer <chr>, washing_mch <chr>, mic_oven <chr>, car <chr>, dvd <chr>,
## # fresh <chr>, phone <chr>, mobile <chr>, revenue <chr>, job <chr>,
## # school_name <chr>, school_nat <chr>, school_type <chr>, mat_s11 <dbl>,
## # cr_s11 <dbl>, cc_s11 <dbl>, bio_s11 <dbl>, eng_s11 <dbl>, cod_spro <chr>,
## # university <chr>, academic_program <chr>, qr_pro <dbl>, cr_pro <dbl>,
## # cc_pro <dbl>, eng_pro <dbl>, wc_pro <dbl>, fep_pro <dbl>, g_sc <dbl>,
## # percentile <dbl>, `2nd_decile` <dbl>, quartile <dbl>, sel <dbl>,
## # sel_ihe <dbl>, sex <dbl>, interaction <dbl>
head(data_acperfor[influential_model1, ]$fep_pro) # look at the values of fep_pro
## [1] 211 221 79 210 145 93
head(data_acperfor[influential_model1, ]$mat_s11)
## [1] 82 94 65 69 100 70
# Bonferroni-adjusted test of the most extreme studentized residuals.
car::outlierTest(mmodel_1)
## rstudent unadjusted p-value Bonferroni p
## 1336 -8.693310 3.9597e-18 4.9144e-14
## 7721 -5.687573 1.3177e-08 1.6354e-04
## 6920 4.625847 3.7684e-06 4.6770e-02
# Bonferonni p-value for most extreme obs
# Are there any cases where the outcome variable has an unusual variable for its predictor values?
Report outliers for model 1: An analysis of standardized residuals was carried out on the data to identify any outliers; it indicated three cases for concern, which were deleted.
# leverage points: An observation that has a value of x that is far away from the mean of x
car::leveragePlots(mmodel_1)
#Assess homoscedasticity: ZRESID vs ZPRED
# Residuals-vs-fitted plot: a flat, patternless spread supports homoscedasticity.
plot(mmodel_1,1)
#from the homoscedasticity plot: assumptions met
# Scale-location plot: sqrt(|standardized residuals|) vs fitted values.
plot(mmodel_1,3)
# Normality of residuals
# create histogram and density plot of the residuals
plot(density(resid(mmodel_1)))
#create QQ plot for standardized residuals
car::qqPlot(mmodel_1, main="Normal QQ plot of Regression Standardized Residual for mmodel_1")
## [1] 1336 7721
Report for Random Normal Distribution of errors for model 1:
The histogram of standardized residuals indicated that the data contained normally distributed errors, as did the normal Q-Q plot of standardized residuals, which showed most points were extremely close to the line.
# non-zero variances
# Descriptive statistics for Model 1's two predictors; their variances
# must be non-zero for the regression assumptions to hold.
predictor_vars1 <- c("fep_pro", "mat_s11")
pastecs::stat.desc(data_acperfor[predictor_vars1], basic = FALSE)
## fep_pro mat_s11
## median 153.0000000 64.0000000
## mean 145.4765933 64.3207638
## SE.mean 0.3601859 0.1065813
## CI.mean.0.95 0.7060202 0.2089158
## var 1610.1268292 140.9835648
## std.dev 40.1263857 11.8736500
## coef.var 0.2758271 0.1846006
Non-Zero Report for model 1: The data also met the assumption of non-zero variances (Formulation of Engineering Projects = 1610.13; Mathematics = 140.98)
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Collinearity Note: Collinearity occurs when two or more independent variables contain strongly redundant information. If variables are collinear, there is not enough distinct information in them for MLR to operate — they are essentially measuring the same thing. If we conduct MLR with collinear variables, the model will produce invalid results, so we need to check for collinearity by examining a correlation matrix that compares the independent variables with each other.
#Collinearity: above 0.8: might be present.
# vif > 2.5 or tolerance < 0.4 might indicate multicollinearity.
# Variance inflation factors for Model 1's two predictors.
vif_mm1 <- car::vif(mmodel_1)
vif_mm1
## data_acperfor$fep_pro data_acperfor$mat_s11
## 1.073247 1.073247
#Calculate tolerance (the reciprocal of VIF)
1/vif_mm1
## data_acperfor$fep_pro data_acperfor$mat_s11
## 0.9317519 0.9317519
The results show no multicollinearity: VIF 1.07 < 2.5 and tolerance 0.93 > 0.4.
Report for collinearity for model 1: Tests to see if the data met the assumption of collinearity indicated that multicollinearity was not a concern (Formulation of Engineering Projects, Tolerance=0.93, VIF=1.07; Mathematics, Tolerance=0.93, VIF=1.07). +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
(sex is dummy coded: F = 1, M = 0)
# Standardized residuals for Model 2; values outside [-3.29, 3.29] flag
# potential outliers. Fixes: '<-' replaces '=' for top-level assignment,
# and FALSE replaces the reassignable shorthand F.
stdres_mmolde2 <- rstandard(mmodel_2)
summary(stdres_mmolde2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -8.644256 -0.645643 0.024830 0.000008 0.669164 4.561478
pastecs::stat.desc(stdres_mmolde2, basic=FALSE)
## median mean SE.mean CI.mean.0.95 var std.dev
## 2.482968e-02 7.639327e-06 8.976679e-03 1.759568e-02 1.000088e+00 1.000044e+00
## coef.var
## 1.309073e+05
If the standardized residuals all lie within [-3.29, 3.29] there are no outliers; here the observed range is [-8.64, 4.56], so outliers are present.
#Influential observations
# An observation that changes the slope of the line.
# They have a large influence on the fit of the model.
# Influential-outlier screening via Cook's distance (sorted ascending).
cooksd_mod2 <- sort(cooks.distance(mmodel_2))
#plot Cook's distance with the conventional 4*mean(D) cutoff
plot(cooksd_mod2, pch="*", cex=2, main="Influential observations by Cooks distance for Model 2")
abline(h=4*mean(cooksd_mod2, na.rm=TRUE), col="red") #cutoff line
# Label only the points above the cutoff.
# Fixes: 'na.rom' was a typo for 'na.rm' (the misspelled argument was
# silently swallowed by mean()'s '...', leaving na.rm at its FALSE
# default), and seq_along() replaces the unsafe 1:length() pattern.
text(x=seq_along(cooksd_mod2)+1, y=cooksd_mod2, labels=ifelse(cooksd_mod2>4*mean(cooksd_mod2, na.rm=TRUE), names(cooksd_mod2), ""), col="red")
#find rows related to influential observations
influential_model2 <- as.numeric(names(cooksd_mod2)[(cooksd_mod2 > 4*mean(cooksd_mod2, na.rm=TRUE))]) # influential row numbers
stem(influential_model2)
##
## The decimal point is 3 digit(s) to the right of the |
##
## 0 | 00111112222333444555666677788888899
## 1 | 001111111112223333333344444555556677788899999
## 2 | 00000000000011111111111111122222233333333444455555566666667777778888
## 3 | 000111112233333444444455556666777777788888888999
## 4 | 0001111222333333444455556667777888899999
## 5 | 00001111222223333333333333444455555666666667777788888889999
## 6 | 0000001111111222222222333344444455555556677777777788888888999999
## 7 | 0000001111122222334444444445566667777778888899999
## 8 | 00001112222333334444444555555556777888889999
## 9 | 000000011112222222233333333344444555666666666677777778888999
## 10 | 00000001111333334444455555666666666777788899999
## 11 | 000001112222222223333344444555555566667788888999
## 12 | 0001111111112222222333333334444444
head(data_acperfor[influential_model2, ]) # influential observations.
## # A tibble: 6 x 46
## cod_s11 gender edu_father edu_mother occ_father occ_mother stratum sisben
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 SB1120… F Incomplet… Incomplet… 0 0 Stratu… Esta …
## 2 SB1120… M Postgradu… Postgradu… Independe… Technical… Stratu… It is…
## 3 SB1120… M Complete … Complete … Executive Home Stratu… It is…
## 4 SB1120… F Complete … Complete … Operator Independe… Stratu… Level…
## 5 SB1120… M Complete … Complete … Technical… Home Stratu… It is…
## 6 SB1120… M Complete … Complete … Technical… Home Stratu… It is…
## # … with 38 more variables: people_house <chr>, internet <chr>, tv <chr>,
## # computer <chr>, washing_mch <chr>, mic_oven <chr>, car <chr>, dvd <chr>,
## # fresh <chr>, phone <chr>, mobile <chr>, revenue <chr>, job <chr>,
## # school_name <chr>, school_nat <chr>, school_type <chr>, mat_s11 <dbl>,
## # cr_s11 <dbl>, cc_s11 <dbl>, bio_s11 <dbl>, eng_s11 <dbl>, cod_spro <chr>,
## # university <chr>, academic_program <chr>, qr_pro <dbl>, cr_pro <dbl>,
## # cc_pro <dbl>, eng_pro <dbl>, wc_pro <dbl>, fep_pro <dbl>, g_sc <dbl>,
## # percentile <dbl>, `2nd_decile` <dbl>, quartile <dbl>, sel <dbl>,
## # sel_ihe <dbl>, sex <dbl>, interaction <dbl>
head(data_acperfor[influential_model2, ]$fep_pro) # look at the values of fep_pro
## [1] 147 210 206 74 98 92
head(data_acperfor[influential_model2, ]$mat_s11)
## [1] 52 100 68 76 61 53
head(data_acperfor[influential_model2, ]$sex)
## [1] 1 0 0 1 0 0
# Bonferroni-adjusted test of the most extreme studentized residuals.
car::outlierTest(mmodel_2)
## rstudent unadjusted p-value Bonferroni p
## 1336 -8.670056 4.8523e-18 6.0222e-14
## 7721 -5.652291 1.6181e-08 2.0083e-04
# Bonferonni p-value for most extreme obs
# Are there any cases where the outcome variable has an unusual variable for its predictor values?
Report outliers for model 2: An analysis of standardized residuals was carried out on the data to identify any outliers; it indicated two cases for concern, which were deleted.
# leverage points: An observation that has a value of x that is far away from the mean of x
car::leveragePlots(mmodel_2)
#Assess homoscedasticity: ZRESID y vs ZPRED x
# ZPRED (the standardized predicted values of the dependent variable based on the model). These values are
# standardized forms of the values predicted by the model.
# ZRESID (the standardized residuals, or errors). These values are the standardized differences between the
# observed data and the values that the model predicts).
# Residuals-vs-fitted plot: a flat, patternless spread supports homoscedasticity.
plot(mmodel_2,1)
#from the homoscedasticity plot: assumptions met
# Scale-location plot: sqrt(|standardized residuals|) vs fitted values.
plot(mmodel_2,3)
# Normality of residuals
# create histogram and density plot of the residuals
plot(density(resid(mmodel_2)))
#create QQ plot for standardized residuals
car::qqPlot(mmodel_2, main="Normal QQ plot \n of Regression Standardized Residual for mmodel_2")
## [1] 1336 7721
Report for Random Normal Distribution of errors for model 2: The histogram of standardized residuals indicated that the data contained normally distributed errors, as did the normal Q-Q plot of standardized residuals, which showed most points were extremely close to the line.
# non-zero variances
# Descriptive statistics for Model 2's predictors (sex added alongside
# the two continuous predictors); variances must be non-zero.
predictor_vars2 <- c("fep_pro", "mat_s11", "sex")
pastecs::stat.desc(data_acperfor[predictor_vars2], basic = FALSE)
## fep_pro mat_s11 sex
## median 153.0000000 64.0000000 0.000000000
## mean 145.4765933 64.3207638 0.406333092
## SE.mean 0.3601859 0.1065813 0.004408863
## CI.mean.0.95 0.7060202 0.2089158 0.008642056
## var 1610.1268292 140.9835648 0.241245948
## std.dev 40.1263857 11.8736500 0.491167943
## coef.var 0.2758271 0.1846006 1.208781547
Non-Zero Report for model 2: The data also met the assumption of non-zero variances (Formulation of Engineering Projects = 1610.13; Mathematics = 140.98, sex=0.24)
#Collinearity: above 0.8: might be present.
# vif > 2.5 or tolerance < 0.4 might indicate multicollinearity.
# Variance inflation factors for Model 2's three predictors.
vif_mm2 <- car::vif(mmodel_2)
vif_mm2
## data_acperfor$fep_pro data_acperfor$mat_s11 data_acperfor$sex
## 1.075136 1.105746 1.030295
#Calculate tolerance (the reciprocal of VIF)
1/vif_mm2
## data_acperfor$fep_pro data_acperfor$mat_s11 data_acperfor$sex
## 0.9301151 0.9043665 0.9705962
The results show no multicollinearity: VIFs 1.08, 1.11, 1.03 < 2.5 and tolerances 0.93, 0.90, 0.97 > 0.4.
Report for collinearity of model 2: Tests to see if the data met the assumption of collinearity indicated that multicollinearity was not a concern (Formulation of Engineering Projects, Tolerance=0.93, VIF=1.08; Mathematics, Tolerance=0.90, VIF=1.11; Sex, Tolerance=0.97, VIF=1.03). +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Standardized residuals for Model 3; values outside [-3.29, 3.29] flag
# potential outliers. Fixes: '<-' replaces '=' for top-level assignment,
# and FALSE replaces the reassignable shorthand F.
stdres_mmolde3 <- rstandard(mmodel_3)
summary(stdres_mmolde3)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -8.616957 -0.645600 0.023838 0.000005 0.667170 4.602219
pastecs::stat.desc(stdres_mmolde3, basic=FALSE)
## median mean SE.mean CI.mean.0.95 var std.dev
## 2.383789e-02 5.266550e-06 8.976672e-03 1.759567e-02 1.000086e+00 1.000043e+00
## coef.var
## 1.898858e+05
If the standardized residuals all lie within [-3.29, 3.29] there are no outliers; here the observed range is [-8.62, 4.60], so outliers are present.
#Influential observations
# An observation that changes the slope of the line.
# They have a large influence on the fit of the model.
# Influential-outlier screening via Cook's distance (sorted ascending).
cooksd_mod3 <- sort(cooks.distance(mmodel_3))
#plot Cook's distance with the conventional 4*mean(D) cutoff
# (plot title fixed: this is Model 3, not Model 2; "Influnential" typo fixed)
plot(cooksd_mod3, pch="*", cex=2, main="Influential observations by Cooks distance for Model 3")
abline(h=4*mean(cooksd_mod3, na.rm=TRUE), col="red") #cutoff line
# Label only the points above the cutoff.
# Fixes: 'na.rom' was a typo for 'na.rm' (the misspelled argument was
# silently swallowed by mean()'s '...', leaving na.rm at its FALSE
# default), and seq_along() replaces the unsafe 1:length() pattern.
text(x=seq_along(cooksd_mod3)+1, y=cooksd_mod3, labels=ifelse(cooksd_mod3>4*mean(cooksd_mod3, na.rm=TRUE), names(cooksd_mod3), ""), col="red")
#find rows related to influential observations
influential_model3 <- as.numeric(names(cooksd_mod3)[(cooksd_mod3 > 4*mean(cooksd_mod3, na.rm=TRUE))]) # influential row numbers
stem(influential_model3)
##
## The decimal point is 3 digit(s) to the right of the |
##
## 0 | 001111122233344455566667788888899
## 1 | 0011111111122233333334444455555667788899999
## 2 | 00000000001111111111111222223333333344444555555666666777778888888899
## 3 | 00111122333334444455555667777777888888889999
## 4 | 0000111122233333444455566677777888899999
## 5 | 00000111122222233333333333334444555556666666677778888888899999
## 6 | 000000111111222222222333444445555555667777777777888888888999999
## 7 | 0000000111112222233444444455666677777778889999
## 8 | 000011122222333344444455555556778888889999
## 9 | 000000011122222222333333334444455566666666677777778888999
## 10 | 0000000111113334444555566666666677778889999
## 11 | 000011112222222222333344444555555566667788888899999
## 12 | 0000111111111122222233333334444444
head(data_acperfor[influential_model3, ]) # influential observations.
## # A tibble: 6 x 46
## cod_s11 gender edu_father edu_mother occ_father occ_mother stratum sisben
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 SB1120… M Complete … Complete … Operator Operator Stratu… It is…
## 2 SB1120… M Complete … Complete … Operator Operator Stratu… Level…
## 3 SB1120… M Incomplet… Complete … Independe… Operator Stratu… Level…
## 4 SB1120… M Incomplet… Complete … Independe… Operator Stratu… It is…
## 5 SB1120… M Complete … Complete … Technical… Home Stratu… It is…
## 6 SB1120… F Incomplet… Complete … Operator Operator Stratu… Level…
## # … with 38 more variables: people_house <chr>, internet <chr>, tv <chr>,
## # computer <chr>, washing_mch <chr>, mic_oven <chr>, car <chr>, dvd <chr>,
## # fresh <chr>, phone <chr>, mobile <chr>, revenue <chr>, job <chr>,
## # school_name <chr>, school_nat <chr>, school_type <chr>, mat_s11 <dbl>,
## # cr_s11 <dbl>, cc_s11 <dbl>, bio_s11 <dbl>, eng_s11 <dbl>, cod_spro <chr>,
## # university <chr>, academic_program <chr>, qr_pro <dbl>, cr_pro <dbl>,
## # cc_pro <dbl>, eng_pro <dbl>, wc_pro <dbl>, fep_pro <dbl>, g_sc <dbl>,
## # percentile <dbl>, `2nd_decile` <dbl>, quartile <dbl>, sel <dbl>,
## # sel_ihe <dbl>, sex <dbl>, interaction <dbl>
head(data_acperfor[influential_model3, ]$mat_s11) # look at the values of mat_s11
## [1] 66 47 61 65 55 54
head(data_acperfor[influential_model3, ]$sex)
## [1] 0 0 0 0 0 1
head(data_acperfor[influential_model3, ]$interaction)
## [1] 9174 4230 3355 12935 5170 7506
# Bonferroni-adjusted test of the most extreme studentized residuals.
car::outlierTest(mmodel_3)
## rstudent unadjusted p-value Bonferroni p
## 1336 -8.642510 6.1691e-18 7.6565e-14
## 7721 -5.614364 2.0152e-08 2.5011e-04
# Bonferonni p-value for most extreme obs
# Are there any cases where the outcome variable has an unusual variable for its predictor values?
Report outliers for model 3: An analysis of standardized residuals was carried out on the data to identify any outliers; it indicated two cases for concern, which were deleted.
# leverage points: An observation that has a value of x that is far away from the mean of x
car::leveragePlots(mmodel_3)
#Assess homoscedasticity: ZRESID y vs ZPRED x
# ZPRED (the standardized predicted values of the dependent variable based on the model). These values are
# standardized forms of the values predicted by the model.
# ZRESID (the standardized residuals, or errors). These values are the standardized differences between the
# observed data and the values that the model predicts).
# Residuals-vs-fitted plot: a flat, patternless spread supports homoscedasticity.
plot(mmodel_3,1)
#from the homoscedasticity plot: assumptions met
# Scale-location plot: sqrt(|standardized residuals|) vs fitted values.
plot(mmodel_3,3)
# Normality of residuals
# create histogram and density plot of the residuals
plot(density(resid(mmodel_3)))
#create QQ plot for standardized residuals
car::qqPlot(mmodel_3, main="Normal QQ plot of Regression Standardized Residual for mmodel_3")
## [1] 1336 7721
Report for Random Normal Distribution of errors for model 3: The histogram of standardized residuals indicated that the data contained normally distributed errors, as did the normal Q-Q plot of standardized residuals, which showed most points were extremely close to the line.
# non-zero variances
# Descriptive statistics for Model 3's predictors (interaction term,
# mathematics, sex); variances must be non-zero.
predictor_vars3 <- c("interaction", "mat_s11", "sex")
pastecs::stat.desc(data_acperfor[predictor_vars3], basic = FALSE)
## interaction mat_s11 sex
## median 9.238000e+03 64.0000000 0.000000000
## mean 9.481624e+03 64.3207638 0.406333092
## SE.mean 3.225355e+01 0.1065813 0.004408863
## CI.mean.0.95 6.322196e+01 0.2089158 0.008642056
## var 1.291105e+07 140.9835648 0.241245948
## std.dev 3.593196e+03 11.8736500 0.491167943
## coef.var 3.789641e-01 0.1846006 1.208781547
Non-Zero Report for model 3: The data also met the assumption of non-zero variances (Interaction = 1.29e+07 ; Mathematics = 140.98, sex=0.24)
#Collinearity: above 0.8: might be present.
# vif > 2.5 or tolerance < 0.4 might indicate multicollinearity.
# Variance inflation factors for Model 3's terms; the printed term names
# reflect the mixed data frames used when fitting mmodel_3.
vif_mm3 <- car::vif(mmodel_3)
vif_mm3
## omitdata$sex omitdata$mat_s11 data_acperfor$interaction
## 1.030075 1.925229 1.885766
#Calculate tolerance (the reciprocal of VIF)
1/vif_mm3
## omitdata$sex omitdata$mat_s11 data_acperfor$interaction
## 0.9708033 0.5194188 0.5302886
The results show no multicollinearity: VIFs 1.03, 1.93, 1.89 < 2.5 and tolerances 0.97, 0.52, 0.53 > 0.4.
Report for collinearity of model 3: Tests to see if the data met the assumption of collinearity indicated that multicollinearity was not a concern (Interaction, Tolerance=0.53, VIF=1.89; Mathematics, Tolerance=0.52, VIF=1.93; Sex, Tolerance=0.97, VIF=1.03).